{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The %pip magic installs into the environment of the running kernel\n",
    "%pip install -U pip\n",
    "\n",
    "# If you don't have ClearML installed then uncomment this line\n",
    "# (the requirement is quoted so that the shell does not treat '>' as an output redirection)\n",
    "# %pip install -U \"clearml>=0.15.1\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pinned dependencies of this example.\n",
    "# %pip (rather than ! pip) makes sure they are installed into the environment of the running kernel.\n",
    "%pip install -U torch==1.5.1\n",
    "%pip install -U pandas==1.0.4\n",
    "%pip install -U numpy==1.18.4\n",
    "%pip install -U pathlib2==2.3.5\n",
    "%pip install -U scikit-learn==0.23.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from collections import Counter\n",
    "from sklearn.model_selection import train_test_split\n",
    "import torch\n",
    "from datetime import datetime\n",
    "from pathlib2 import Path\n",
    "from clearml import Task"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Register this run with ClearML and grab the logger used for the reported tables\n",
    "task = Task.init(project_name=\"Table Example\", task_name=\"tabular preprocessing\")\n",
    "logger = task.get_logger()\n",
    "\n",
    "# Connecting the dictionary enables configuration override by clearml\n",
    "configuration_dict = task.connect({\"test_size\": 0.1, \"split_random_state\": 0})\n",
    "\n",
    "# Show the actual configuration (after override in remote mode)\n",
    "print(configuration_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the shelter-animal-outcomes dataset (https://www.kaggle.com/c/shelter-animal-outcomes).\n",
    "# This dataset aims to improve the understanding of trends in animal outcomes,\n",
    "# which could help shelters focus their energy on specific animals who need extra help finding a new home.\n",
    "#\n",
    "# Directory of the dataset: \"train.csv\" is read from it and the processed CSV files are written to it.\n",
    "path_to_ShelterAnimal = \"./data\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the raw training data and report its first rows to ClearML\n",
    "train_set = pd.read_csv(Path(path_to_ShelterAnimal).joinpath(\"train.csv\"))\n",
    "logger.report_table(\n",
    "    title=\"Trainset - raw\", series=\"pandas DataFrame\", iteration=0, table_plot=train_set.head()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# **Pre-processing**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the month of the DateTime data (day, hour and year are discarded)\n",
    "timestamp = pd.to_datetime(train_set[\"DateTime\"])\n",
    "train_set[\"Month\"] = timestamp.dt.month.astype(\"object\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert \"AgeuponOutcome\" strings of the form \"<amount> <unit>\" into an age in months\n",
    "months_per_unit = {\"day\": 1.0 / 30, \"week\": 1.0 / 4, \"month\": 1.0, \"year\": 12.0}\n",
    "age = train_set[\"AgeuponOutcome\"]\n",
    "months_age = []\n",
    "for val in age:\n",
    "    if pd.isnull(val):\n",
    "        # Missing ages are kept as they are; they are imputed in the \"Fill NA Values\" section\n",
    "        months_age.append(val)\n",
    "        continue\n",
    "    amount, time_type = val.split(\" \")\n",
    "    # Substring match, so that a unit is recognised with or without a plural suffix.\n",
    "    # An unrecognised unit yields NaN instead of silently reusing the multiplier of the previous row.\n",
    "    mult = next(\n",
    "        (factor for unit, factor in months_per_unit.items() if unit in time_type),\n",
    "        np.nan,\n",
    "    )\n",
    "    months_age.append(int(amount) * mult)\n",
    "train_set[\"Age\"] = pd.DataFrame(months_age).astype(np.float32)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split \"SexuponOutcome\" (formatted as \"<status> <sex>\") into separate \"Sex\" and \"Neutered\" features\n",
    "sex_neutered = train_set[\"SexuponOutcome\"]\n",
    "sex, neutered = [], []\n",
    "for entry in sex_neutered:\n",
    "    if pd.isnull(entry):\n",
    "        # missing values are passed through unchanged\n",
    "        sex_value = neutered_value = entry\n",
    "    elif \"Unknown\" in entry:\n",
    "        # an unknown sex is treated as a missing value\n",
    "        sex_value = neutered_value = np.nan\n",
    "    else:\n",
    "        status, sex_value = entry.split(\" \")\n",
    "        neutered_value = \"Yes\" if status in (\"Neutered\", \"Spayed\") else \"No\"\n",
    "    sex.append(sex_value)\n",
    "    neutered.append(neutered_value)\n",
    "\n",
    "train_set[\"Sex\"] = pd.DataFrame(sex)\n",
    "train_set[\"Neutered\"] = pd.DataFrame(neutered)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove irrelevant columns, including the raw date / age / sex columns\n",
    "# that were replaced by the derived \"Month\", \"Age\", \"Sex\" and \"Neutered\" columns\n",
    "train_set = train_set.drop(\n",
    "    columns=[\"Name\", \"OutcomeSubtype\", \"AnimalID\", \"DateTime\", \"AgeuponOutcome\", \"SexuponOutcome\"]\n",
    ")\n",
    "logger.report_table(\n",
    "    title=\"Trainset - after preprocessing\", series=\"pandas DataFrame\", iteration=0, table_plot=train_set.head()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## *Fill NA Values*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Independent copies of the categorical (object) and numerical parts of the data;\n",
    "# they determine which columns are imputed and encoded in the following cells\n",
    "object_columns = train_set.select_dtypes(include=\"object\").copy()\n",
    "numerical_columns = train_set.select_dtypes(include=\"number\").copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Impute missing categorical values with the most common value of the column\n",
    "for col in object_columns.columns:\n",
    "    if object_columns[col].isnull().sum() > 0:\n",
    "        # Count only the observed values, so that NaN itself can never be chosen as the replacement\n",
    "        observed = object_columns[col].dropna()\n",
    "        if observed.empty:\n",
    "            print('Column \"{}\": no observed values, null values are left as is'.format(col))\n",
    "            continue\n",
    "        most_common = Counter(observed).most_common(1)[0][0]\n",
    "        print('Column \"{}\": replacing null values with \"{}\"'.format(col, most_common))\n",
    "        # Assign the result back: fillna(inplace=True) on a selected column is a chained\n",
    "        # assignment, which is not guaranteed to update train_set in newer pandas versions\n",
    "        train_set[col] = train_set[col].fillna(most_common)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Impute missing numerical values with the median of the column\n",
    "for col in numerical_columns.columns:\n",
    "    if numerical_columns[col].isnull().sum() > 0:\n",
    "        median_val = numerical_columns[col].median()\n",
    "        print('Column \"{}\": replacing null values with \"{}\"'.format(col, median_val))\n",
    "        # Assign the result back: fillna(inplace=True) on a selected column is a chained\n",
    "        # assignment, which is not guaranteed to update train_set in newer pandas versions\n",
    "        train_set[col] = train_set[col].fillna(median_val)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Report the first rows of the data after the missing values were filled\n",
    "logger.report_table(\n",
    "    title=\"Trainset - after filling missing values\", series=\"pandas DataFrame\", iteration=0, table_plot=train_set.head()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## *Label Encoding*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map every outcome label to an integer, following the order of the categories,\n",
    "# and store the mapping as an artifact of the task\n",
    "out_encoding = train_set[\"OutcomeType\"].astype(\"category\").cat.categories\n",
    "outcome_dict = dict(zip(out_encoding, range(len(out_encoding))))\n",
    "task.upload_artifact(\"Outcome dictionary\", outcome_dict)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace every categorical value by its integer category code\n",
    "for column in object_columns.columns:\n",
    "    as_category = train_set[column].astype(\"category\")\n",
    "    train_set[column] = as_category.cat.codes\n",
    "\n",
    "logger.report_table(\n",
    "    title=\"Trainset - after labels encoding\", series=\"pandas DataFrame\", iteration=0, table_plot=train_set.head()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## *Splitting dataset*"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Separate the features from the target and hold out a validation set;\n",
    "# the size and the seed of the split come from the (possibly overridden) configuration\n",
    "split_test_size = configuration_dict.get(\"test_size\", 0.1)\n",
    "split_seed = configuration_dict.get(\"split_random_state\", 0)\n",
    "Y = train_set[\"OutcomeType\"]\n",
    "X = train_set.drop(columns=[\"OutcomeType\"])\n",
    "X_train, X_val, Y_train, Y_val = train_test_split(\n",
    "    X, Y, test_size=split_test_size, random_state=split_seed\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make the encoded feature columns of X categorical and record the number of categories of each one\n",
    "object_columns_names = object_columns.columns.drop(\"OutcomeType\")\n",
    "columns_categories = {}\n",
    "for name in object_columns_names:\n",
    "    X[name] = X[name].astype(\"category\")\n",
    "    columns_categories[name] = len(X[name].cat.categories)\n",
    "\n",
    "# NOTE: the artifact name is kept exactly as is (including its spelling);\n",
    "# presumably consumers look the artifact up by this exact name - verify before renaming\n",
    "task.upload_artifact(\"Categries per column\", columns_categories)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-attach the target to each split and write both splits next to the raw data\n",
    "output_dir = Path(path_to_ShelterAnimal)\n",
    "train_df = X_train.join(Y_train)\n",
    "val_df = X_val.join(Y_val)\n",
    "train_df.to_csv(output_dir / \"train_processed.csv\", index=False)\n",
    "val_df.to_csv(output_dir / \"val_processed.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Publish the locations of the processed files as an artifact of the task\n",
    "processed_dir = Path(path_to_ShelterAnimal)\n",
    "train_data_path = str(processed_dir / \"train_processed.csv\")\n",
    "val_data_path = str(processed_dir / \"val_processed.csv\")\n",
    "paths = {\"train_data\": train_data_path, \"val_data\": val_data_path}\n",
    "task.upload_artifact(\"Processed data\", paths)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
